From: Yves-Alexis Perez
Date: Fri, 2 Feb 2018 08:46:54 +0000 (+0100)
Subject: Revert "mm: fix 100% CPU kswapd busyloop on unreclaimable nodes"
X-Git-Tag: archive/raspbian/4.9.80-2+rpi1~8^2~1
X-Git-Url: https://dgit.raspbian.org/%22http://www.example.com/cgi/%22/%22http:/www.example.com/cgi/%22?a=commitdiff_plain;h=45a8a21fe75fb28099e07b2ddda4b9e1e6c616e4;p=linux-4.9.git

Revert "mm: fix 100% CPU kswapd busyloop on unreclaimable nodes"

This reverts commit 19a7db1e2ef38865a704ea4dfd178b02a8026ada which is
c73322d098e4b6f5f0f0fa1330bf57e218775539 upstream.

Adding a new field to struct pglist_data changes the ABI. Since the
problem doesn't seem to occur often, revert the change for now.

Gbp-Pq: Topic debian
Gbp-Pq: Name revert-mm-fix-100-CPU-kswapd-busyloop-on-unreclaimab.patch
---

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 65a686a7bf34..1192eb029c5b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -633,8 +633,6 @@ typedef struct pglist_data {
 	int kswapd_order;
 	enum zone_type kswapd_classzone_idx;
 
-	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
-
 #ifdef CONFIG_COMPACTION
 	int kcompactd_max_order;
 	enum zone_type kcompactd_classzone_idx;
diff --git a/mm/internal.h b/mm/internal.h
index 3e2d01694747..34a5459e5989 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -73,12 +73,6 @@ static inline void set_page_refcounted(struct page *page)
 
 extern unsigned long highest_memmap_pfn;
 
-/*
- * Maximum number of reclaim retries without progress before the OOM
- * killer is consider the only way forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
 /*
  * in mm/vmscan.c:
  */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 94018ea5f935..546713b3f762 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3421,6 +3421,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	return false;
 }
 
+/*
+ * Maximum number of reclaim retries without any progress before OOM killer
+ * is consider as the only way to move forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
 /*
  * Checks whether it makes sense to retry the reclaim to make a forward progress
  * for the given allocation request.
@@ -4379,8 +4385,7 @@ void show_free_areas(unsigned int filter)
 			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
 			K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
 			node_page_state(pgdat, NR_PAGES_SCANNED),
-			pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
-				"yes" : "no");
+			!pgdat_reclaimable(pgdat) ? "yes" : "no");
 	}
 
 	for_each_populated_zone(zone) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f118dc23f662..30a88b945a44 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2606,15 +2606,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
 					 sc->nr_scanned - nr_scanned, sc));
 
-	/*
-	 * Kswapd gives up on balancing particular nodes after too
-	 * many failures to reclaim anything from them and goes to
-	 * sleep. On reclaim progress, reset the failure counter. A
-	 * successful direct reclaim run will revive a dormant kswapd.
-	 */
-	if (reclaimable)
-		pgdat->kswapd_failures = 0;
-
 	return reclaimable;
 }
 
@@ -2689,6 +2680,10 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 						 GFP_KERNEL | __GFP_HARDWALL))
 				continue;
 
+			if (sc->priority != DEF_PRIORITY &&
+			    !pgdat_reclaimable(zone->zone_pgdat))
+				continue;	/* Let kswapd poll it */
+
 			/*
 			 * If we already have plenty of memory free for
 			 * compaction in this zone, don't free any more.
@@ -2825,7 +2820,7 @@ retry:
 	return 0;
 }
 
-static bool allow_direct_reclaim(pg_data_t *pgdat)
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 {
 	struct zone *zone;
 	unsigned long pfmemalloc_reserve = 0;
@@ -2833,9 +2828,6 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
 	int i;
 	bool wmark_ok;
 
-	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
-		return true;
-
 	for (i = 0; i <= ZONE_NORMAL; i++) {
 		zone = &pgdat->node_zones[i];
 		if (!managed_zone(zone) ||
@@ -2916,7 +2908,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 
 		/* Throttle based on the first usable node */
 		pgdat = zone->zone_pgdat;
-		if (allow_direct_reclaim(pgdat))
+		if (pfmemalloc_watermark_ok(pgdat))
 			goto out;
 		break;
 	}
@@ -2938,14 +2930,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 	 */
 	if (!(gfp_mask & __GFP_FS)) {
 		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
-			allow_direct_reclaim(pgdat), HZ);
+			pfmemalloc_watermark_ok(pgdat), HZ);
 
 		goto check_pending;
 	}
 
 	/* Throttle until kswapd wakes the process */
 	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-		allow_direct_reclaim(pgdat));
+		pfmemalloc_watermark_ok(pgdat));
 
 check_pending:
 	if (fatal_signal_pending(current))
@@ -3124,7 +3116,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 
 	/*
 	 * The throttled processes are normally woken up in balance_pgdat() as
-	 * soon as allow_direct_reclaim() is true. But there is a potential
+	 * soon as pfmemalloc_watermark_ok() is true. But there is a potential
 	 * race between when kswapd checks the watermarks and a process gets
 	 * throttled. There is also a potential race if processes get
 	 * throttled, kswapd wakes, a large process exits thereby balancing the
@@ -3138,10 +3130,6 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	if (waitqueue_active(&pgdat->pfmemalloc_wait))
 		wake_up_all(&pgdat->pfmemalloc_wait);
 
-	/* Hopeless node, leave it to direct reclaim */
-	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
-		return true;
-
 	for (i = 0; i <= classzone_idx; i++) {
 		struct zone *zone = pgdat->node_zones + i;
 
@@ -3228,9 +3216,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 	count_vm_event(PAGEOUTRUN);
 
 	do {
-		unsigned long nr_reclaimed = sc.nr_reclaimed;
 		bool raise_priority = true;
 
+		sc.nr_reclaimed = 0;
 		sc.reclaim_idx = classzone_idx;
 
 		/*
@@ -3309,7 +3297,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		 * able to safely make forward progress. Wake them
 		 */
 		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
-				allow_direct_reclaim(pgdat))
+				pfmemalloc_watermark_ok(pgdat))
 			wake_up_all(&pgdat->pfmemalloc_wait);
 
 		/* Check if kswapd should be suspending */
@@ -3320,14 +3308,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		 * Raise priority if scanning rate is too low or there was no
 		 * progress in reclaiming pages
 		 */
-		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
-		if (raise_priority || !nr_reclaimed)
+		if (raise_priority || !sc.nr_reclaimed)
 			sc.priority--;
 	} while (sc.priority >= 1);
 
-	if (!sc.nr_reclaimed)
-		pgdat->kswapd_failures++;
-
 out:
 	/*
 	 * Return the order kswapd stopped reclaiming at as
@@ -3527,10 +3511,6 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
 
-	/* Hopeless node, leave it to direct reclaim */
-	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
-		return;
-
 	/* Only wake kswapd if all zones are unbalanced */
 	for (z = 0; z <= classzone_idx; z++) {
 		zone = pgdat->node_zones + z;
@@ -3801,6 +3781,9 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 	    sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
 		return NODE_RECLAIM_FULL;
 
+	if (!pgdat_reclaimable(pgdat))
+		return NODE_RECLAIM_FULL;
+
 	/*
 	 * Do not scan if the allocation should not be delayed.
 	 */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 3863b5d6d598..6a088df04b29 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1421,7 +1421,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n  node_unreclaimable:    %u"
 		   "\n  start_pfn:             %lu"
 		   "\n  node_inactive_ratio:   %u",
-		   pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
+		   !pgdat_reclaimable(zone->zone_pgdat),
 		   zone->zone_start_pfn,
 		   zone->zone_pgdat->inactive_ratio);
 	seq_putc(m, '\n');